import os
import re
import json
import pandas as pd
import time
from pathlib import Path
from collections import defaultdict, Counter
from typing import List, Dict, Tuple, Set

# NLTK imports for tokenization, POS tagging and stopwords
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords

# Translation utilities (local package)
from .translate_utils import translate_with_llm
from .concurrent_translate import translate_sentence_batch

# Ensure all required NLTK data packages are present, downloading each one
# only on first use. (resource lookup path, download package name) pairs —
# replaces five copy-pasted try/except stanzas with one loop.
_NLTK_RESOURCES = [
    ('tokenizers/punkt', 'punkt'),
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
    ('corpora/stopwords', 'stopwords'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('taggers/averaged_perceptron_tagger_eng', 'averaged_perceptron_tagger_eng'),
]

for _resource_path, _package_name in _NLTK_RESOURCES:
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)


class SentenceQualityEvaluator:
    """Rule-based quality scorer for children's example sentences.

    Eight ``evaluate_*`` methods each score one rule on its own point scale;
    :meth:`calculate_total_score` combines them into a weighted total.
    """

    def __init__(self):
        # NLTK's English stopword set.
        self.stop_words = set(stopwords.words('english'))
        # Basic vocabulary (common words children are assumed to already know).
        self.basic_vocabulary = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
            'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
            'my', 'your', 'his', 'her', 'its', 'our', 'their', 'this', 'that', 'these', 'those',
            'is', 'am', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
            'do', 'does', 'did', 'will', 'would', 'can', 'could', 'should', 'may', 'might',
            'go', 'come', 'get', 'put', 'take', 'give', 'make', 'see', 'look', 'like', 'want',
            'good', 'bad', 'big', 'small', 'new', 'old', 'happy', 'sad', 'nice', 'little'
        }

        # Phrasal-verb list (used to distinguish plain prepositions from
        # particles that are part of a phrasal verb).
        self.phrasal_verbs = {
            'look after', 'look at', 'look for', 'look up', 'look out',
            'get up', 'get on', 'get off', 'get in', 'get out',
            'put on', 'put off', 'put up', 'put down',
            'turn on', 'turn off', 'turn up', 'turn down'
        }

        # Dialogue indicator words (tokens that typically appear in dialogue).
        self.dialogue_indicators = {
            'i', 'you', 'me', 'my', 'your', 'let', 'please', 'hello', 'hi', 'bye', 'goodbye',
            'yes', 'no', 'ok', 'okay', 'well', 'oh', 'ah', 'wow', 'hey'
        }

    def evaluate_completeness(self, sentence: str) -> float:
        """Evaluate content completeness (rule 1) — max 20 points."""
        score = 0

        # Check for a complete sentence (has both a subject and a verb).
        tokens = word_tokenize(sentence.lower())
        pos_tags = pos_tag(tokens)

        # Penn Treebank tags: PRP* = pronouns, NN* = nouns, VB* = verbs.
        has_subject = any(tag.startswith('PRP') or tag.startswith('NN') for _, tag in pos_tags)
        has_verb = any(tag.startswith('VB') for _, tag in pos_tags)

        if has_subject and has_verb:
            score += 10
        elif has_subject or has_verb:
            score += 5

        # Check sentence length (very short sentences may be incomplete).
        if len(tokens) >= 4:
            score += 5

        # Check for terminal punctuation (period, question or exclamation mark).
        if sentence.strip().endswith(('.', '!', '?')):
            score += 5

        return min(score, 20)

    def evaluate_length(self, sentence: str) -> float:
        """Evaluate sentence length (rule 2) — max 15 points."""
        tokens = word_tokenize(sentence)
        length = len(tokens)

        if 4 <= length <= 15:
            # Optimal length range: full marks.
            return 15
        elif length < 4:
            # Too short: scale score proportionally.
            return max(0, length * 3.75)  # 4 words = 15 points
        else:
            # Too long: deduct proportionally to the excess.
            excess = length - 15
            penalty = min(excess * 1.5, 15)  # 1.5 points off per extra word
            return max(0, 15 - penalty)

    def evaluate_word_clarity(self, sentence: str, target_word: str) -> float:
        """Evaluate clarity of the core word's meaning (rule 3) — max 20 points."""
        score = 0
        sentence_lower = sentence.lower()
        target_lower = target_word.lower()

        # Check that the target word appears as a standalone token.
        words = word_tokenize(sentence_lower)
        if target_lower in words:
            score += 10

        # Check whether the target appears inside a phrasal verb (for a
        # preposition target, being a phrasal-verb particle lowers the score).
        for phrasal_verb in self.phrasal_verbs:
            if phrasal_verb in sentence_lower and target_lower in phrasal_verb.split():
                # Preposition target used as a phrasal-verb particle: deduct.
                if target_lower in ['after', 'up', 'on', 'off', 'out', 'in', 'at', 'for']:
                    score -= 5
                break
        else:
            # for/else: runs only when no break occurred, i.e. the target is
            # not inside any known phrasal verb; reward standalone prepositions.
            if target_lower in ['after', 'up', 'on', 'off', 'out', 'in', 'at', 'for']:
                score += 5

        # Reward words that carry meaning in the sentence (non-stopwords).
        if target_lower not in self.stop_words:
            score += 5

        return min(score, 20)

    def evaluate_difficulty(self, sentence: str, target_word: str) -> float:
        """Evaluate difficulty appropriateness (rule 4) — max 15 points."""
        tokens = word_tokenize(sentence.lower())

        # Count non-basic vocabulary words (excluding the target word itself).
        difficult_words = 0
        for token in tokens:
            if (token.isalpha() and
                    token not in self.basic_vocabulary and
                    token != target_word.lower() and
                    len(token) > 2):
                difficult_words += 1

        # Score based on the number of difficult words (max 10).
        if difficult_words <= 2:
            difficulty_score = 10
        elif difficult_words <= 4:
            difficulty_score = 7
        elif difficult_words <= 6:
            difficulty_score = 4
        else:
            difficulty_score = 0

        # Check syntactic complexity (max 5).
        pos_tags = pos_tag(tokens)
        complex_structures = 0

        # Subordinators/relativizers signal complex clause structure.
        for token, tag in pos_tags:
            if token in ['that', 'which', 'who', 'when', 'where', 'because', 'although']:
                complex_structures += 1

        syntax_score = max(0, 5 - complex_structures)

        return difficulty_score + syntax_score

    def evaluate_positivity(self, sentence: str) -> float:
        """Evaluate positivity and child relevance of content (rule 5) — max 10 points."""
        sentence_lower = sentence.lower()

        # Positive words.
        positive_words = {
            'love', 'like', 'happy', 'good', 'great', 'wonderful', 'nice', 'fun', 'play',
            'laugh', 'smile', 'enjoy', 'beautiful', 'lovely', 'best', 'favorite'
        }

        # Negative words.
        negative_words = {
            'hate', 'angry', 'bad', 'terrible', 'awful', 'sad', 'cry', 'hurt', 'pain',
            'scary', 'afraid', 'worried', 'sick', 'broken'
        }

        # Child-related words.
        child_related = {
            'play', 'game', 'toy', 'family', 'mummy', 'daddy', 'brother', 'sister',
            'school', 'friend', 'home', 'garden', 'park', 'birthday', 'cake'
        }

        score = 5  # base score

        # Bonus for at least one positive word (substring match; applied once).
        for word in positive_words:
            if word in sentence_lower:
                score += 2
                break

        # Penalty for at least one negative word (applied once).
        for word in negative_words:
            if word in sentence_lower:
                score -= 3
                break

        # Bonus for at least one child-related word (applied once).
        for word in child_related:
            if word in sentence_lower:
                score += 3
                break

        return min(max(score, 0), 10)

    def evaluate_diversity(self, sentence: str, existing_patterns: Set[str]) -> float:
        """Evaluate sentence-pattern diversity (rule 6) — max 10 points.

        NOTE: mutates ``existing_patterns`` by adding this sentence's pattern
        when it is new.
        """
        # Simplified sentence-pattern recognition.
        tokens = word_tokenize(sentence.lower())
        pos_tags = pos_tag(tokens)

        # Extract a (simplified) pattern from the first few words.
        pattern = []
        for token, tag in pos_tags[:5]:  # only consider the first 5 tokens
            if tag.startswith('VB'):
                pattern.append('VERB')
            elif tag.startswith('NN'):
                pattern.append('NOUN')
            elif tag.startswith('PRP'):
                pattern.append('PRON')
            elif tag.startswith('JJ'):
                pattern.append('ADJ')
            elif token in ['the', 'a', 'an']:
                pattern.append('DET')

        pattern_str = '-'.join(pattern)

        # New pattern: high score (and remember it for subsequent sentences).
        if pattern_str not in existing_patterns:
            existing_patterns.add(pattern_str)
            return 10
        else:
            return 3  # repeated pattern: low score

    def evaluate_dialogue_priority(self, sentence: str) -> float:
        """Evaluate the dialogue-first principle (rule 7) — max 5 points."""
        sentence_lower = sentence.lower()

        # Count dialogue indicator tokens (1 point each).
        dialogue_score = 0
        for indicator in self.dialogue_indicators:
            if indicator in word_tokenize(sentence_lower):
                dialogue_score += 1

        # Check for quotation marks / dialogue punctuation.
        if '"' in sentence or "'" in sentence:
            dialogue_score += 2

        # Check for a question or exclamation.
        if sentence.strip().endswith(('?', '!')):
            dialogue_score += 1

        return min(dialogue_score, 5)

    def evaluate_contextual_richness(self, sentence: str) -> float:
        """Evaluate contextual richness (rule 8) — max 5 points."""
        tokens = word_tokenize(sentence.lower())

        # Information density: alphabetic, non-stopword tokens.
        content_words = [token for token in tokens
                         if token.isalpha() and token not in self.stop_words]

        # Count descriptive elements (adjectives and adverbs).
        pos_tags = pos_tag(tokens)
        descriptive_elements = 0

        for token, tag in pos_tags:
            if tag.startswith('JJ'):  # adjective
                descriptive_elements += 1
            elif tag.startswith('RB'):  # adverb
                descriptive_elements += 1

        # Score from content-word count plus descriptive elements, capped at 5.
        richness_score = min(len(content_words) * 0.5 + descriptive_elements, 5)

        return richness_score

    def calculate_total_score(self, sentence: str, target_word: str,
                              existing_patterns: Set[str]) -> Dict[str, float]:
        """Compute all rule scores plus the weighted total for a sentence.

        Returns a dict of per-rule scores with an extra ``'total'`` key.
        ``existing_patterns`` is updated in place via ``evaluate_diversity``.
        """
        scores = {
            'completeness': self.evaluate_completeness(sentence),
            'length': self.evaluate_length(sentence),
            'word_clarity': self.evaluate_word_clarity(sentence, target_word),
            'difficulty': self.evaluate_difficulty(sentence, target_word),
            'positivity': self.evaluate_positivity(sentence),
            'diversity': self.evaluate_diversity(sentence, existing_patterns),
            'dialogue': self.evaluate_dialogue_priority(sentence),
            'richness': self.evaluate_contextual_richness(sentence)
        }

        # Weighted total (weights sum to 1.0, mirroring each rule's share).
        weights = {
            'completeness': 0.20,
            'length': 0.15,
            'word_clarity': 0.20,
            'difficulty': 0.15,
            'positivity': 0.10,
            'diversity': 0.10,
            'dialogue': 0.05,
            'richness': 0.05
        }

        total_score = sum(scores[key] * weights[key] for key in scores)
        scores['total'] = total_score

        return scores


def time_str_to_ms(time_str):
    """Convert an SRT/VTT-style time string to milliseconds.

    Accepts ``HH:MM:SS,mmm`` (SRT) or ``HH:MM:SS.mmm`` (VTT), tolerates
    surrounding double quotes, and accepts a full ``start --> end`` range
    (only the start time is used).

    Args:
        time_str: The time string to convert.

    Returns:
        int: Total milliseconds represented by the (start) time.

    Raises:
        TypeError: If ``time_str`` is not a string.
        ValueError: If the string is not in ``HH:MM:SS`` form.
    """
    if not isinstance(time_str, str):
        raise TypeError(f"Expected string or bytes-like object, got {type(time_str)}")

    # Strip surrounding double quotes, e.g. '"00:00:01,000"'.
    if time_str.startswith('"') and time_str.endswith('"'):
        time_str = time_str[1:-1]

    # If a full "start --> end" range is given, keep only the start time.
    # (The original's second branch for ' --> ' was dead code: any string
    # containing ' --> ' also contains '-->'.)
    if '-->' in time_str:
        time_str = time_str.split('-->')[0].strip()

    # Support both fractional separators: comma (SRT) and dot (VTT).
    if ',' in time_str:
        hh_mm_ss, ms = time_str.split(',')
    elif '.' in time_str:
        hh_mm_ss, ms = time_str.split('.')
    else:
        # No fractional part: default to zero milliseconds.
        hh_mm_ss = time_str
        ms = '0'

    h, m, s = hh_mm_ss.split(':')
    # Interpret the fractional part as a fraction of a second: right-pad to
    # three digits so "1.5" means 500 ms, not 5 ms (fixes short fractions;
    # standard 3-digit timestamps are unaffected).
    ms = ms.strip()
    ms_value = int(ms.ljust(3, '0')[:3]) if ms else 0
    return int(h) * 3600 * 1000 + int(m) * 60 * 1000 + int(s) * 1000 + ms_value


def _is_too_similar(new_sentence: str, existing_sentences: List[Dict]) -> bool:
    """Check whether a candidate sentence is too similar to already-chosen ones.

    Similarity is the Jaccard overlap of lowercase token sets; anything above
    0.7 is treated as a near-duplicate.

    Args:
        new_sentence: Candidate sentence text.
        existing_sentences: Previously selected items, each carrying a
            ``'sentence'`` key.

    Returns:
        bool: True if the candidate overlaps more than 70% with any existing
        sentence.
    """
    if not existing_sentences:
        return False

    new_words = set(word_tokenize(new_sentence.lower()))

    for existing in existing_sentences:
        existing_words = set(word_tokenize(existing['sentence'].lower()))

        union = new_words | existing_words
        if not union:
            # Both token sets empty (e.g. empty/punctuation-only strings):
            # identical-empty sentences are duplicates; also avoids the
            # ZeroDivisionError the original would raise here.
            return True

        # Jaccard word-overlap ratio.
        overlap = len(new_words & existing_words) / len(union)

        # Over 70% overlap counts as too similar.
        if overlap > 0.7:
            return True

    return False

def generate_word_dataset(word_list_path, ip_data_dir, max_sentences_per_word=7):
    """Match words from a word list against subtitles and build a dataset.

    For each word, subtitle sentences containing it are scored with
    ``SentenceQualityEvaluator``, near-duplicates are dropped, the best
    candidates are batch-translated, and the results are written to JSON.

    Args:
        word_list_path: Path to the word-list file (one word per line).
        ip_data_dir: IP animation data directory (must contain
            ``subtitles_images.csv``).
        max_sentences_per_word: Maximum number of example sentences kept per
            word.

    Returns:
        str: Path of the generated dataset JSON file.

    Raises:
        FileNotFoundError: If ``subtitles_images.csv`` is missing.
        KeyError: If the CSV lacks the required ``ip_id`` column.
    """
    # Read the word list (one word per line, blank lines skipped).
    with open(word_list_path, 'r', encoding='utf-8') as f:
        word_list = [line.strip() for line in f if line.strip()]
    output_dir = str(Path(ip_data_dir).parent)

    print(f"成功读取单词列表，共{len(word_list)}个单词")

    # Locate the subtitle/image association CSV file.
    subtitles_images_path = os.path.join(ip_data_dir, "subtitles_images.csv")

    if not os.path.exists(subtitles_images_path):
        raise FileNotFoundError(f"未找到字幕图片关联文件：{subtitles_images_path}")
    else:
        print(f"使用字幕图片关联文件：{subtitles_images_path}")
        df = pd.read_csv(subtitles_images_path)
        # Read the IP id from the CSV (taken from the first row; assumed
        # constant across rows — TODO confirm).
        if 'ip_id' in df.columns:
            ip_id = df['ip_id'].iloc[0]
        else:
            raise KeyError("CSV文件中缺少ip_id列")
    print(f"成功读取字幕数据，共{len(df)}行")

    result_data = {}
    evaluator = SentenceQualityEvaluator()
    # Process each word in turn.
    for word in word_list:
        print(f"处理单词：{word}")
        all_matches = []

        # Scan every subtitle row for the current word.
        for _, row in df.iterrows():
            try:
                timestamp = row['timestamp']
                sentence = row['sentence']

                sentence = str(sentence)
                # Skip rows without any Latin letters.
                if not re.search(r'[a-zA-Z]', sentence):
                    continue

                # Keep only the English part (handles mixed Chinese/English).
                english_text = re.sub(r'[^a-zA-Z0-9\s\.,!?\'\"\-]', '', sentence)

                # Clean the text: strip punctuation, lowercase, split to words.
                cleaned_text = re.sub(r'[^\w\s]', ' ', english_text.lower())
                words_in_text = cleaned_text.split()

                # Skip sentences with fewer than 4 words.
                if len(words_in_text) < 4:
                    continue

                # Whole-word, case-insensitive match.
                if word.lower() in [w.lower() for w in words_in_text]:
                    # Parse the timestamp range; only the start time is used.
                    start_time, end_time = timestamp.strip('"').split('-->')

                    # start_ms is unused downstream, but the conversion
                    # validates the timestamp: malformed rows raise here and
                    # are skipped by the except below.
                    start_ms = time_str_to_ms(start_time.strip())

                    # Build the frame-image path and verify the file exists.
                    if 'image_id' in df.columns:
                        image_id = row['image_id']
                        frame_path = f"images/original/{ip_id}/{image_id}.jpg"
                        if not os.path.exists(os.path.join(output_dir, frame_path)):
                            raise Exception(f"未找到帧图片: {os.path.join(output_dir, frame_path)}")
                    else:
                        raise KeyError("CSV文件中缺少image_id列")

                    # Build the audio path and verify the file exists.
                    if 'audio_id' in df.columns:
                        audio_id = row['audio_id']
                        audio_path = f"audio/original/{ip_id}/{audio_id}.mp3"
                        if not os.path.exists(os.path.join(output_dir, audio_path)):
                            raise Exception(f"未找到音频文件: {os.path.join(output_dir, audio_path)}")
                    else:
                        raise KeyError("CSV文件中缺少audio_id列")

                    # Record the match.
                    all_matches.append({
                        "sentence": sentence,
                        "timestamp": timestamp,
                        "frame_path": "http://159.138.23.13/" + frame_path,
                        "audio_path": "http://159.138.23.13/" + audio_path,
                        "image_id": image_id,
                        "audio_id": audio_id,
                        "ip_id": ip_id
                    })
            except Exception as e:
                # Bug fix: the original `print(f"e")` printed the literal
                # letter "e", hiding every per-row error.
                print(f"处理字幕行时出错: {e}")
                continue

        # If there are matches, filter them down to the best examples.
        if all_matches:
            print(f"单词'{word}'匹配到{len(all_matches)}个结果，开始筛选...")

            # Score every candidate sentence.
            word_patterns = set()  # sentence patterns already used for this word
            scored_matches = []

            for match in all_matches:
                sentence_text = match['sentence']
                scores = evaluator.calculate_total_score(sentence_text, word, word_patterns)

                match['scores'] = scores
                match['total_score'] = scores['total']
                scored_matches.append(match)

            # Sort by total score, best first.
            scored_matches.sort(key=lambda x: x['total_score'], reverse=True)

            # Pick the best sentences and prepare them for batch translation
            # (take twice the quota so duplicates can be dropped).
            translation_candidates = []
            for match in scored_matches[:max_sentences_per_word * 2]:
                sentence_text = match['sentence']

                # Avoid selecting near-duplicate sentences.
                if _is_too_similar(sentence_text, translation_candidates):
                    continue

                # Strip scoring fields and attach the word for translation.
                translation_item = match.copy()
                del translation_item['scores']
                del translation_item['total_score']
                translation_item['word'] = word
                translation_candidates.append(translation_item)

            # Translate all candidate sentences concurrently.
            try:
                translated_matches = translate_sentence_batch(translation_candidates, max_workers=10)
            except Exception as e:
                print(f"并发翻译时出错: {e}")
                continue
            # Drop results whose translation failed, keep at most the quota.
            selected_matches = [match for match in translated_matches
                                if match.get('word_meaning') and match.get('sentence_zh')][:max_sentences_per_word]

            result_data[word] = selected_matches
            print(f"单词'{word}'筛选后保留{len(selected_matches)}个高质量例句")
        else:
            print(f"单词'{word}'没有匹配到任何结果")

    # Save the results as a JSON file.
    output_file = Path(ip_data_dir) / f"word_dataset_{Path(ip_data_dir).name}.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result_data, f, ensure_ascii=False, indent=2)

    print(f"成功生成单词数据集，共{len(result_data)}个单词有匹配结果")
    print(f"结果已保存到：{output_file}")

    # Return a string path as documented (the original returned a Path object).
    return str(output_file)
